{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\haha\\AppData\\Local\\Temp\\ipykernel_58236\\2087404159.py:11: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.\n",
      "  a = pd.read_html(html)\n"
     ]
    },
    {
     "ename": "ImportError",
     "evalue": "Missing optional dependency 'lxml'.  Use pip or conda to install lxml.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "File \u001b[1;32mc:\\Users\\haha\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\compat\\_optional.py:135\u001b[0m, in \u001b[0;36mimport_optional_dependency\u001b[1;34m(name, extra, errors, min_version)\u001b[0m\n\u001b[0;32m    134\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 135\u001b[0m     module \u001b[38;5;241m=\u001b[39m \u001b[43mimportlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    136\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n",
      "File \u001b[1;32mc:\\Users\\haha\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\importlib\\__init__.py:90\u001b[0m, in \u001b[0;36mimport_module\u001b[1;34m(name, package)\u001b[0m\n\u001b[0;32m     89\u001b[0m         level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m---> 90\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m<frozen importlib._bootstrap>:1387\u001b[0m, in \u001b[0;36m_gcd_import\u001b[1;34m(name, package, level)\u001b[0m\n",
      "File \u001b[1;32m<frozen importlib._bootstrap>:1360\u001b[0m, in \u001b[0;36m_find_and_load\u001b[1;34m(name, import_)\u001b[0m\n",
      "File \u001b[1;32m<frozen importlib._bootstrap>:1310\u001b[0m, in \u001b[0;36m_find_and_load_unlocked\u001b[1;34m(name, import_)\u001b[0m\n",
      "File \u001b[1;32m<frozen importlib._bootstrap>:488\u001b[0m, in \u001b[0;36m_call_with_frames_removed\u001b[1;34m(f, *args, **kwds)\u001b[0m\n",
      "File \u001b[1;32m<frozen importlib._bootstrap>:1387\u001b[0m, in \u001b[0;36m_gcd_import\u001b[1;34m(name, package, level)\u001b[0m\n",
      "File \u001b[1;32m<frozen importlib._bootstrap>:1360\u001b[0m, in \u001b[0;36m_find_and_load\u001b[1;34m(name, import_)\u001b[0m\n",
      "File \u001b[1;32m<frozen importlib._bootstrap>:1324\u001b[0m, in \u001b[0;36m_find_and_load_unlocked\u001b[1;34m(name, import_)\u001b[0m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'lxml'",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[1], line 11\u001b[0m\n\u001b[0;32m      8\u001b[0m html \u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mtext\n\u001b[0;32m     10\u001b[0m \u001b[38;5;66;03m# 解析网页\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m a \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_html\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhtml\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     12\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(a)\n\u001b[0;32m     14\u001b[0m \u001b[38;5;66;03m# 保存数据\u001b[39;00m\n",
      "File \u001b[1;32mc:\\Users\\haha\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\html.py:1240\u001b[0m, in \u001b[0;36mread_html\u001b[1;34m(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links, dtype_backend, storage_options)\u001b[0m\n\u001b[0;32m   1224\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(io, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28many\u001b[39m(\n\u001b[0;32m   1225\u001b[0m     [\n\u001b[0;32m   1226\u001b[0m         is_file_like(io),\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1230\u001b[0m     ]\n\u001b[0;32m   1231\u001b[0m ):\n\u001b[0;32m   1232\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m   1233\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPassing literal html to \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mread_html\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m is deprecated and \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1234\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwill be removed in a future version. To read from a \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1237\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m   1238\u001b[0m     )\n\u001b[1;32m-> 1240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_parse\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1241\u001b[0m \u001b[43m    \u001b[49m\u001b[43mflavor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mflavor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1242\u001b[0m \u001b[43m    \u001b[49m\u001b[43mio\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1243\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1244\u001b[0m \u001b[43m    \u001b[49m\u001b[43mheader\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1245\u001b[0m \u001b[43m    \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1246\u001b[0m \u001b[43m    \u001b[49m\u001b[43mskiprows\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskiprows\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1247\u001b[0m \u001b[43m    \u001b[49m\u001b[43mparse_dates\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_dates\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1248\u001b[0m \u001b[43m    \u001b[49m\u001b[43mthousands\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mthousands\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1249\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattrs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattrs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1250\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1251\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdecimal\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecimal\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1252\u001b[0m \u001b[43m    \u001b[49m\u001b[43mconverters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconverters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1253\u001b[0m \u001b[43m    \u001b[49m\u001b[43mna_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1254\u001b[0m \u001b[43m    \u001b[49m\u001b[43mkeep_default_na\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeep_default_na\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1255\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdisplayed_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisplayed_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1256\u001b[0m \u001b[43m    \u001b[49m\u001b[43mextract_links\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextract_links\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1257\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtype_backend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype_backend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1258\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1259\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\haha\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\html.py:971\u001b[0m, in \u001b[0;36m_parse\u001b[1;34m(flavor, io, match, attrs, encoding, displayed_only, extract_links, storage_options, **kwargs)\u001b[0m\n\u001b[0;32m    969\u001b[0m retained \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m    970\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m flav \u001b[38;5;129;01min\u001b[39;00m flavor:\n\u001b[1;32m--> 971\u001b[0m     parser \u001b[38;5;241m=\u001b[39m \u001b[43m_parser_dispatch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflav\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    972\u001b[0m     p \u001b[38;5;241m=\u001b[39m parser(\n\u001b[0;32m    973\u001b[0m         io,\n\u001b[0;32m    974\u001b[0m         compiled_match,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    979\u001b[0m         storage_options,\n\u001b[0;32m    980\u001b[0m     )\n\u001b[0;32m    982\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n",
      "File \u001b[1;32mc:\\Users\\haha\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\html.py:918\u001b[0m, in \u001b[0;36m_parser_dispatch\u001b[1;34m(flavor)\u001b[0m\n\u001b[0;32m    916\u001b[0m     import_optional_dependency(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbs4\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m    917\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 918\u001b[0m     \u001b[43mimport_optional_dependency\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlxml.etree\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m    919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _valid_parsers[flavor]\n",
      "File \u001b[1;32mc:\\Users\\haha\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\compat\\_optional.py:138\u001b[0m, in \u001b[0;36mimport_optional_dependency\u001b[1;34m(name, extra, errors, min_version)\u001b[0m\n\u001b[0;32m    136\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m    137\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 138\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(msg)\n\u001b[0;32m    139\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m    141\u001b[0m \u001b[38;5;66;03m# Handle submodules: if we have submodule, grab parent module from sys.modules\u001b[39;00m\n",
      "\u001b[1;31mImportError\u001b[0m: Missing optional dependency 'lxml'.  Use pip or conda to install lxml."
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "\n",
    "# 获取响应\n",
    "url = 'http://news.sina.com.cn/zt_d/qswur2020/'\n",
    "r = requests.get(url)\n",
    "r.encoding = r.apparent_encoding\n",
    "html = r.text\n",
    "\n",
    "# 解析网页\n",
    "a = pd.read_html(html)\n",
    "df = pd.concat(a)\n",
    "\n",
    "# 保存数据\n",
    "df.to_csv('./世界大学QS排名3.csv',index=False,encoding='utf-8')\n",
    "print('写入成功')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "\n",
    "# 获取数据\n",
    "def get_page(url):\n",
    "    try:\n",
    "        r = requests.get(url, headers=headers)\n",
    "        if r.status_code == 200:\n",
    "            return r.json()\n",
    "    except requests.ConnectionError as e:\n",
    "        print(e)\n",
    "\n",
    "\n",
    "# 解析数据\n",
    "def parser_page(json):\n",
    "    if json:\n",
    "        items = json.get(\"data\")\n",
    "        for i in range(len(items)):\n",
    "            item = items[i]\n",
    "            qsrank = {}\n",
    "            if \"=\" in item[\"rank_display\"]:\n",
    "                rk_str = str(item[\"rank_display\"]).split(\"=\")[-1]\n",
    "                qsrank[\"rank_display\"] = rk_str\n",
    "            else:\n",
    "                qsrank[\"rank_display\"] = item[\"rank_display\"]\n",
    "            qsrank[\"title\"] = item[\"title\"]\n",
    "            qsrank[\"region\"] = item[\"region\"]\n",
    "            qsrank[\"score\"] = item[\"score\"]\n",
    "            # url可根据需求提取\n",
    "            qsrank[\"url\"] = item[\"url\"]\n",
    "            yield qsrank\n",
    "\n",
    "\n",
    "# 主函数\n",
    "def main():\n",
    "    url = 'https://www.qschina.cn/sites/default/files/qs-rankings-data/cn/2208747_indicators.txt'\n",
    "    path = \"./1.txt\"\n",
    "    \n",
    "    json = get_page(url)\n",
    "    results = parser_page(json)\n",
    "\n",
    "    for result in results:\n",
    "        with open(path, \"a\") as f:  # path 是文件要存储的地方\n",
    "            f.write(\n",
    "                \"{:10}{:50}{:^88}{:>}\\n\".format(\n",
    "                    result[\"rank_display\"],\n",
    "                    result[\"title\"],\n",
    "                    result[\"region\"],\n",
    "                    result[\"score\"],\n",
    "                )\n",
    "            )\n",
    "            f.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
